{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, sys\n",
    "sys.path.append('../')\n",
    "\n",
    "import datasets\n",
    "import pandas as pd\n",
    "import glob\n",
    "import string\n",
    "from collections import Counter\n",
    "from panlex_utils import load_panlex_resources, extract_monolingual_lexicon"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processing bug bug\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.bug-date=20221220,language=bug/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c4f6c3160e934898872b9a60baece3b3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processing mad mad\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.mad-date=20221220,language=mad/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "aef58207607b4654b03fd7b2bdd66290",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processing jv jav\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.jv-date=20221220,language=jv/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "af72b5da433c4b4aa7eb38c34a39559b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processing min min\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.min-date=20221220,language=min/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f42dcd3689ee4c89bf7c00eff3177966",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processing su sun\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.su-date=20221220,language=su/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "8a2e74a2c78344a79fca456f01258b28",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processing map-bms map-bms\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.map-bms-date=20221220,language=map-bms/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "33346b843f3e4867bfdf4b7efbc28210",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processing ace ace\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.ace-date=20221220,language=ace/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ebf3f0f321cd43439eb401ec14f41fa5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processing gor gor\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.gor-date=20221220,language=gor/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "d1fb565e36d141138222799919c01984",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processing ban ban\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.ban-date=20221220,language=ban/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "11109c345b8e41dd8478cc5c5bc0d1ad",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processing bjn bjn\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.bjn-date=20221220,language=bjn/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "44172bcefe254482a89a3a3d86ed7c2d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "processing nia nia\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Found cached dataset wikipedia (/home/samuel/.cache/huggingface/datasets/olm___wikipedia/20221220.nia-date=20221220,language=nia/2.0.0/dbfec0358f063ec7ae9e247d6559e2e505fbce7463e666024718863cbf199ec6)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ccec30595b3a4634908c7cddd272c894",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Load Wiki\n",
    "wiki_dsets = {}\n",
    "for lang in ['bug', 'mad', 'jv', 'min', 'su', 'map-bms', 'ace', 'gor', 'ban', 'bjn', 'nia']:\n",
    "    if lang == 'su':\n",
    "        iso_lang = 'sun'\n",
    "    elif lang == 'jv':\n",
    "        iso_lang = 'jav'\n",
    "    elif lang == 'id':\n",
    "        iso_lang = 'ind'\n",
    "    elif lang == 'en':\n",
    "        iso_lang = 'eng'\n",
    "    else:\n",
    "        iso_lang = lang\n",
    "    print(f'processing {lang} {iso_lang}')\n",
    "    wiki_dsets[iso_lang] = datasets.load_dataset('olm/wikipedia', language=lang, date=\"20221220\")['train'].to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Paragraph\n",
    "paragraph_dsets = {}\n",
    "for path in glob.glob('../data/nusa_alinea-paragraph-*.csv'):\n",
    "    _, _, lang, _ = path[:-4].split('/')[-1].split('-')\n",
    "\n",
    "    if lang not in paragraph_dsets:\n",
    "        paragraph_dsets[lang] = []\n",
    "    paragraph_dsets[lang].append(pd.read_csv(path))\n",
    "\n",
    "for lang, dfs in paragraph_dsets.items():\n",
    "    paragraph_dsets[lang] = pd.concat(dfs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load MT\n",
    "mt_dsets = {}\n",
    "for path in glob.glob('../data/nusa_kalimat-mt-*.csv'):\n",
    "    _, _, lang, _ = path[:-4].split('/')[-1].split('-')\n",
    "\n",
    "    if lang not in mt_dsets:\n",
    "        mt_dsets[lang] = []\n",
    "    mt_dsets[lang].append(pd.read_csv(path))\n",
    "\n",
    "for lang, dfs in mt_dsets.items():\n",
    "    mt_dsets[lang] = pd.concat(dfs)\n",
    "    mt_dsets[lang]['text'] = mt_dsets[lang]['tgt_text']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 31.4 s, sys: 303 ms, total: 31.7 s\n",
      "Wall time: 31.7 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Chunk & Count Text\n",
    "wiki_counters = {}\n",
    "paragraph_counters = {}\n",
    "mt_counters = {}\n",
    "\n",
    "all_langs = list(set(mt_dsets.keys()).union(set(paragraph_dsets.keys())).union(set(wiki_dsets.keys())))\n",
    "for lang in all_langs:\n",
    "    # Clean Text\n",
    "    replacement_rules = str.maketrans('', '', string.punctuation)\n",
    "    if lang in wiki_dsets:\n",
    "        wiki_dsets[lang]['clean_text'] = wiki_dsets[lang]['text'].apply(lambda x: x.lower().translate(replacement_rules).replace('\\n',' '))\n",
    "    if lang in paragraph_dsets:\n",
    "        paragraph_dsets[lang]['clean_text'] = paragraph_dsets[lang]['text'].apply(lambda x: x.lower().translate(replacement_rules).replace('\\n',' '))\n",
    "    if lang in mt_dsets:\n",
    "        mt_dsets[lang]['clean_text'] = mt_dsets[lang]['text'].apply(lambda x: x.lower().translate(replacement_rules).replace('\\n',' '))\n",
    "        \n",
    "    # Chunk & Count Text\n",
    "    wiki_counter = Counter()\n",
    "    paragraph_counter = Counter()\n",
    "    mt_counter = Counter()\n",
    "    \n",
    "    if lang in wiki_dsets:\n",
    "        for ct in wiki_dsets[lang]['clean_text']:\n",
    "            for word in ct.split(' '):\n",
    "                if len(word) > 0:\n",
    "                    wiki_counter[word] += 1\n",
    "\n",
    "    if lang in paragraph_dsets:\n",
    "        for ct in paragraph_dsets[lang]['clean_text']:\n",
    "            for word in ct.split(' '):\n",
    "                if len(word) > 0:\n",
    "                    paragraph_counter[word] += 1\n",
    "\n",
    "    if lang in mt_dsets:\n",
    "        for ct in mt_dsets[lang]['clean_text']:\n",
    "            for word in ct.split(' '):\n",
    "                if len(word) > 0:\n",
    "                    mt_counter[word] += 1\n",
    "                    \n",
    "    wiki_counters[lang] = wiki_counter\n",
    "    paragraph_counters[lang] = paragraph_counter\n",
    "    mt_counters[lang] = mt_counter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "### General Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mui 0 1474 1574\n",
      "jav 72737 10188 9449\n",
      "btk 0 4908 9449\n",
      "bjn 10122 0 0\n",
      "gor 14389 0 0\n",
      "min 226237 8608 9449\n",
      "bhp 0 0 1579\n",
      "map-bms 13575 0 0\n",
      "ace 12829 0 0\n",
      "bug 15866 1000 0\n",
      "mad 1002 5211 9449\n",
      "sun 61331 9594 9449\n",
      "nia 1620 0 0\n",
      "ban 17161 0 0\n",
      "rej 0 1200 1574\n",
      "abs 0 0 1574\n",
      "bew 0 9755 9449\n",
      "mak 0 5471 9449\n"
     ]
    }
   ],
   "source": [
    "# Number of document\n",
    "for lang in all_langs:\n",
    "    wiki_doc, para_doc, mt_doc = 0, 0, 0\n",
    "    \n",
    "    if lang in wiki_dsets:\n",
    "        wiki_doc = len(wiki_dsets[lang])\n",
    "    if lang in paragraph_dsets:\n",
    "        para_doc = len(paragraph_dsets[lang])\n",
    "    if lang in mt_dsets:\n",
    "        mt_doc = len(mt_dsets[lang])\n",
    "    \n",
    "    print(f'{lang} {wiki_doc} {para_doc} {mt_doc}')    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mui 0 182632 36233\n",
      "jav 8519436 1116678 208034\n",
      "btk 0 562947 214165\n",
      "bjn 680176 0 0\n",
      "gor 603073 0 0\n",
      "min 12623423 960961 211084\n",
      "bhp 0 0 32988\n",
      "map-bms 548796 0 0\n",
      "ace 489036 0 0\n",
      "bug 287823 118392 0\n",
      "mad 106335 573833 211503\n",
      "sun 5514173 1111848 209492\n",
      "nia 246643 0 0\n",
      "ban 1691041 0 0\n",
      "rej 0 156900 34668\n",
      "abs 0 0 37816\n",
      "bew 0 1164932 209867\n",
      "mak 0 609262 191528\n"
     ]
    }
   ],
   "source": [
    "# Number of tokens\n",
    "for lang in all_langs:\n",
    "    wiki_num, para_num, mt_num = 0, 0, 0\n",
    "    \n",
    "    if lang in wiki_dsets:\n",
    "        wiki_num = sum(wiki_counters[lang].values())\n",
    "    if lang in paragraph_dsets:\n",
    "        para_num = sum(paragraph_counters[lang].values())\n",
    "    if lang in mt_dsets:\n",
    "        mt_num = sum(mt_counters[lang].values())\n",
    "    print(f'{lang} {wiki_num} {para_num} {mt_num}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mui 0.0 123.9023066485753 23.01969504447268\n",
      "jav 117.12657932001595 109.60718492343933 22.016509683564397\n",
      "btk 0.0 114.69987775061125 22.665361413906233\n",
      "bjn 67.19778699861688 0.0 0.0\n",
      "gor 41.912085620960454 0.0 0.0\n",
      "min 55.79734084168372 111.63580390334573 22.339295163509366\n",
      "bhp 0.0 0.0 20.89170360987967\n",
      "map-bms 40.42696132596685 0.0 0.0\n",
      "ace 38.119572842778084 0.0 0.0\n",
      "bug 18.140867263330392 118.392 0.0\n",
      "mad 106.12275449101796 110.11955478794857 22.383638480262462\n",
      "sun 89.90841499404868 115.88993120700438 22.17081172610858\n",
      "nia 152.24876543209876 0.0 0.0\n",
      "ban 98.53977040964979 0.0 0.0\n",
      "rej 0.0 130.75 22.025412960609913\n",
      "abs 0.0 0.0 24.025412960609913\n",
      "bew 0.0 119.41896463352127 22.21049846544608\n",
      "mak 0.0 111.36209102540668 20.269658164885172\n"
     ]
    }
   ],
   "source": [
    "# Number of tokens / document\n",
    "for lang in all_langs:\n",
    "    wiki_num, para_num, mt_num = 0, 0, 0\n",
    "    wiki_doc, para_doc, mt_doc = 1, 1, 1\n",
    "    \n",
    "    if lang in wiki_dsets:\n",
    "        wiki_num = sum(wiki_counters[lang].values())\n",
    "        wiki_doc = len(wiki_dsets[lang])\n",
    "    if lang in paragraph_dsets:\n",
    "        para_num = sum(paragraph_counters[lang].values())\n",
    "        para_doc = len(paragraph_dsets[lang])\n",
    "    if lang in mt_dsets:\n",
    "        mt_num = sum(mt_counters[lang].values())\n",
    "        mt_doc = len(mt_dsets[lang])\n",
    "    \n",
    "    print(f'{lang} {wiki_num / wiki_doc} {para_num / para_doc} {mt_num / mt_doc}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mui 0 17847 8546\n",
      "jav 483499 45190 28707\n",
      "btk 0 37239 33514\n",
      "bjn 82072 0 0\n",
      "gor 45223 0 0\n",
      "min 284476 46955 26980\n",
      "bhp 0 0 7003\n",
      "map-bms 57888 0 0\n",
      "ace 36606 0 0\n",
      "bug 17542 12909 0\n",
      "mad 23085 42443 34422\n",
      "sun 289629 47648 25831\n",
      "nia 25927 0 0\n",
      "ban 135264 0 0\n",
      "rej 0 9895 9061\n",
      "abs 0 0 5877\n",
      "bew 0 60834 28681\n",
      "mak 0 49036 35482\n"
     ]
    }
   ],
   "source": [
    "# Number of Unique Tokens\n",
    "for lang in all_langs:\n",
    "    wiki_len, para_len, mt_len = 0, 0, 0\n",
    "    \n",
    "    if lang in wiki_dsets:\n",
    "        wiki_len = len(wiki_counters[lang])\n",
    "    if lang in paragraph_dsets:\n",
    "        para_len = len(paragraph_counters[lang])\n",
    "    if lang in mt_dsets:\n",
    "        mt_len = len(mt_counters[lang])\n",
    "    print(f'{lang} {wiki_len} {para_len} {mt_len}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mui 0 0.9999439713133124 0.9998829998829999\n",
      "jav 483499 0.410258679825629 0.43667270447262085\n",
      "btk 0 0.9999731471535983 0.999970162613755\n",
      "bjn 82072 0.0 0.0\n",
      "gor 45223 0.0 0.0\n",
      "min 284476 0.4418817616492035 0.4901597420406953\n",
      "bhp 0 0.0 0.9998572244431754\n",
      "map-bms 57888 0.0 0.0\n",
      "ace 36606 0.0 0.0\n",
      "bug 17542 0.9061967467079783 0.0\n",
      "mad 23085 0.8601451324097634 0.8637538854835429\n",
      "sun 289629 0.38248441730151733 0.44758439145246204\n",
      "nia 25927 0.0 0.0\n",
      "ban 135264 0.0 0.0\n",
      "rej 0 0.9998989490703315 0.9998896490840874\n",
      "abs 0 0.0 0.9998298741068391\n",
      "bew 0 0.9999835620941891 0.9999651349278293\n",
      "mak 0 0.999979607235353 0.9999718174900657\n"
     ]
    }
   ],
   "source": [
    "# Percentage of novel words\n",
    "for lang in all_langs:\n",
    "    wiki_words, para_words, mt_words = set(), set(), set()\n",
    "\n",
    "    if lang in wiki_dsets:\n",
    "        wiki_words = set(wiki_counters[lang].keys())\n",
    "    if lang in paragraph_dsets:\n",
    "        para_words = set(paragraph_counters[lang].keys())\n",
    "    if lang in mt_dsets:\n",
    "        mt_words = set(mt_counters[lang].keys())\n",
    "\n",
    "    print('{} {} {} {}'.format(\n",
    "        lang, len(wiki_words), \n",
    "        len(para_words - wiki_words) / (len(para_words) + 1),\n",
    "        len(mt_words - wiki_words) / (len(mt_words) + 1)\n",
    "    ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mui 0 0.0 0.0\n",
      "jav 483499 0.9448790072388832 0.9665522233712512\n",
      "btk 0 0.0 0.0\n",
      "bjn 82072 0.9999878157250253 0.9999878157250253\n",
      "gor 45223 0.9999778878471608 0.9999778878471608\n",
      "min 284476 0.9078765594406578 0.9516445969269923\n",
      "bhp 0 0.0 0.0\n",
      "map-bms 57888 0.9999827255609874 0.9999827255609874\n",
      "ace 36606 0.9999726828202257 0.9999726828202257\n",
      "bug 17542 0.9309696175112581 0.9999429972068631\n",
      "mad 23085 0.742874469375379 0.7968465736810187\n",
      "sun 289629 0.8984083140558644 0.9507302420329385\n",
      "nia 25927 0.999961431656896 0.999961431656896\n",
      "ban 135264 0.9999926071045725 0.9999926071045725\n",
      "rej 0 0.0 0.0\n",
      "abs 0 0.0 0.0\n",
      "bew 0 0.0 0.0\n",
      "mak 0 0.0 0.0\n"
     ]
    }
   ],
   "source": [
    "# Percentage of novel words\n",
    "for lang in all_langs:\n",
    "    wiki_words, para_words, mt_words = set(), set(), set()\n",
    "\n",
    "    if lang in wiki_dsets:\n",
    "        wiki_words = set(wiki_counters[lang].keys())\n",
    "    if lang in paragraph_dsets:\n",
    "        para_words = set(paragraph_counters[lang].keys())\n",
    "    if lang in mt_dsets:\n",
    "        mt_words = set(mt_counters[lang].keys())\n",
    "\n",
    "    print('{} {} {} {}'.format(\n",
    "        lang, len(wiki_words), \n",
    "        len(wiki_words - para_words) / (len(wiki_words) + 1),\n",
    "        len(wiki_words - mt_words) / (len(wiki_words) + 1)\n",
    "    ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mui 0 0.0 0.0977205652866678 0.23585582601976043\n",
      "jav 483499 0.05675245911202818 0.040468209754101224 0.13799120340327348\n",
      "btk 0 0.0 0.06614998188109737 0.15648609022907464\n",
      "bjn 82072 0.12066270985346461 0.0 0.0\n",
      "gor 45223 0.07498748080666717 0.0 0.0\n",
      "min 284476 0.022535565627836 0.04886249404242832 0.1278158087974039\n",
      "bhp 0 0.0 0.0 0.2122828821728455\n",
      "map-bms 57888 0.1054816261750702 0.0 0.0\n",
      "ace 36606 0.07485323196404362 0.0 0.0\n",
      "bug 17542 0.0609469675913058 0.10903516255184006 0.0\n",
      "mad 23085 0.21709486909419198 0.07396389896729716 0.1627486950601407\n",
      "sun 289629 0.05252445787891351 0.04285474016705506 0.1233024492465142\n",
      "nia 25927 0.10511911905418336 0.0 0.0\n",
      "ban 135264 0.07998855143751604 0.0 0.0\n",
      "rej 0 0.0 0.06306524496338456 0.2613574086359572\n",
      "abs 0 0.0 0.0 0.15540629875452838\n",
      "bew 0 0.0 0.05222102902055311 0.13666209236281854\n",
      "mak 0 0.0 0.08048412590293519 0.1852565407849464\n"
     ]
    }
   ],
   "source": [
    "# Percentage of Unique Word\n",
    "for lang in all_langs:\n",
    "    wiki_words, para_words, mt_words = set(), set(), set()\n",
    "\n",
    "    if lang in wiki_dsets:\n",
    "        wiki_words = set(wiki_counters[lang].keys())\n",
    "    if lang in paragraph_dsets:\n",
    "        para_words = set(paragraph_counters[lang].keys())\n",
    "    if lang in mt_dsets:\n",
    "        mt_words = set(mt_counters[lang].keys())\n",
    "        \n",
    "    print('{} {} {} {} {}'.format(\n",
    "        lang, len(wiki_words), \n",
    "        len(wiki_words) / (sum(list(wiki_counters[lang].values())) + 1),\n",
    "        len(para_words) / (sum(list(paragraph_counters[lang].values())) + 1),\n",
    "        len(mt_words) / (sum(list(mt_counters[lang].values())) + 1)\n",
    "    ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Percentage of Unique Word\n",
    "for lang in all_langs:\n",
    "    data = {}\n",
    "    if lang in wiki_dsets:\n",
    "        data['wiki_top'] = list(map(lambda x: x[0], wiki_counters[lang].most_common(100)))\n",
    "        data['wiki_cnt'] = list(map(lambda x: x[1], wiki_counters[lang].most_common(100)))\n",
    "    if lang in paragraph_dsets:\n",
    "        data['para_top'] = list(map(lambda x: x[0], paragraph_counters[lang].most_common(100)))\n",
    "        data['para_cnt'] = list(map(lambda x: x[1], paragraph_counters[lang].most_common(100)))       \n",
    "    if lang in mt_dsets:\n",
    "        data['mt_top'] = list(map(lambda x: x[0], mt_counters[lang].most_common(100)))\n",
    "        data['mt_cnt'] = list(map(lambda x: x[1], mt_counters[lang].most_common(100)))\n",
    "    pd.DataFrame(data).to_csv(f'{lang}_top100.csv', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Lexicon Overlapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1min 24s, sys: 4.98 s, total: 1min 29s\n",
      "Wall time: 1min 29s\n"
     ]
    }
   ],
   "source": [
     "%%time\n",
     "# Load the Indonesian and English PanLex lexicons, caching them as\n",
     "# zip-compressed pickles so the expensive PanLex parse runs only once.\n",
     "# NOTE(review): deno_df is loaded but unused here — presumably needed by\n",
     "# other notebooks; confirm before removing.\n",
     "if os.path.exists('ind_lexicon.zip') and os.path.exists('eng_lexicon.zip'):\n",
     "    ind_lexicon = pd.read_pickle('ind_lexicon.zip')\n",
     "    eng_lexicon = pd.read_pickle('eng_lexicon.zip')\n",
     "else:\n",
     "    langvar_df, expr_df, deno_df = load_panlex_resources('../resources/panlex-20230501-csv')\n",
     "    \n",
     "    ind_lexicon = extract_monolingual_lexicon('ind', langvar_df, expr_df)\n",
     "    eng_lexicon = extract_monolingual_lexicon('eng', langvar_df, expr_df)\n",
     "    \n",
     "    ind_lexicon.to_pickle('ind_lexicon.zip')\n",
     "    eng_lexicon.to_pickle('eng_lexicon.zip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "ind_words, eng_words = set(), set()\n",
    "replacement_rules = str.maketrans('', '', string.punctuation)\n",
    "\n",
    "for word in ind_lexicon['ind'].values:\n",
    "    ind_words.add(word.lower().translate(replacement_rules))\n",
    "for word in eng_lexicon['eng'].values:\n",
    "    eng_words.add(word.lower().translate(replacement_rules))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mui ind 0 0.5640020898641588 0.5984481086323957\n",
      "mui eng 0 0.3014629049111808 0.4248302618816683\n",
      "jav ind 0.23096331608627405 0.41798082869511444 0.4376122082585278\n",
      "jav eng 0.43225393872449275 0.2627551020408163 0.3256283662477558\n",
      "btk ind 0 0.4394136422745005 0.4337637494021999\n",
      "btk eng 0 0.28915947511526185 0.3703969392635103\n",
      "bjn ind 0.4835850801479655 0 0\n",
      "bjn eng 0.39603884093711467 0 0\n",
      "gor ind 0.5395945945945946 0 0\n",
      "gor eng 0.41175675675675677 0 0\n",
      "min ind 0.2435402918486848 0.4165727170236753 0.44605858537747084\n",
      "min eng 0.33423180592991913 0.25696569495893057 0.3362705406049059\n",
      "bhp ind 0 0 0.41485714285714287\n",
      "bhp eng 0 0 0.5497142857142857\n",
      "map-bms ind 0.5681731137495213 0 0\n",
      "map-bms eng 0.3701646878590578 0 0\n",
      "ace ind 0.36175548589341694 0 0\n",
      "ace eng 0.42424242424242425 0 0\n",
      "bug ind 0.3902439024390244 0.31163130943672274 0\n",
      "bug eng 0.48267008985879334 0.2571324067300658 0\n",
      "mad ind 0.45634534242129177 0.30735489135978783 0.33533057851239667\n",
      "mad eng 0.4118792599805258 0.25563602978679995 0.2975206611570248\n",
      "sun ind 0.26061927438942756 0.4044528301886792 0.44783834586466165\n",
      "sun eng 0.43421883068449135 0.26543396226415095 0.30357142857142855\n",
      "nia ind 0.2744759023125135 0 0\n",
      "nia eng 0.2848497946833802 0 0\n",
      "ban ind 0.30079028527370855 0 0\n",
      "ban eng 0.34184656900539706 0 0\n",
      "rej ind 0 0.3541927409261577 0.38218714768883877\n",
      "rej eng 0 0.2390488110137672 0.3382187147688839\n",
      "abs ind 0 0 0.6616052060737527\n",
      "abs eng 0 0 0.5119305856832972\n",
      "bew ind 0 0.43850430744791347 0.42600513259195893\n",
      "bew eng 0 0.2163499725056516 0.26646706586826346\n",
      "mak ind 0 0.3655304848775804 0.2316447221602321\n",
      "mak eng 0 0.2730676908305329 0.26645837982593173\n"
     ]
    }
   ],
   "source": [
    "min_threshold = 5\n",
    "for lang in all_langs:\n",
    "    wiki_words, para_words, mt_words = set(), set(), set()\n",
    "\n",
    "    if lang in wiki_dsets:\n",
    "        wiki_counter = {x: count for x, count in wiki_counters[lang].items() if count >= min_threshold}\n",
    "        wiki_words = set(wiki_counter.keys())\n",
    "    if lang in paragraph_dsets:\n",
    "        paragraph_counter = {x: count for x, count in paragraph_counters[lang].items() if count >= min_threshold}\n",
    "        para_words = set(paragraph_counter.keys())\n",
    "    if lang in mt_dsets:\n",
    "        mt_counter = {x: count for x, count in mt_counters[lang].items() if count >= min_threshold}\n",
    "        mt_words = set(mt_counter.keys())\n",
    "\n",
    "    print('{} {} {} {} {}'.format(\n",
    "        lang, 'ind',\n",
    "        len(wiki_words.intersection(ind_words)) / len(wiki_words) if len(wiki_words) > 0 else 0,\n",
    "        len(para_words.intersection(ind_words)) / len(para_words) if len(para_words) > 0 else 0,\n",
    "        len(mt_words.intersection(ind_words)) / len(mt_words) if len(mt_words) > 0 else 0\n",
    "    ))\n",
    "\n",
    "    print('{} {} {} {} {}'.format(\n",
    "        lang, 'eng',\n",
    "        len(wiki_words.intersection(eng_words)) / len(wiki_words) if len(wiki_words) > 0 else 0,\n",
    "        len(para_words.intersection(eng_words)) / len(para_words) if len(para_words) > 0 else 0,\n",
    "        len(mt_words.intersection(eng_words)) / len(mt_words) if len(mt_words) > 0 else 0\n",
    "    ))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (env_instruct_align)",
   "language": "python",
   "name": "env-instruct-align"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
